In [37]:
import pandas as pd
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy
import json
import re
from bokeh.charts import Scatter, Bar, output_notebook, show
from bokeh.plotting import *
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
# from mpl_toolkits.basemap import Basemap
In [3]:
# Read the dataset URLs, one per line. Later cells index this list by position
# (urls[0] .. urls[3]), so line order in the file matters and no line is dropped.
# The file is only read, so it is opened read-only; 'r+' would additionally
# require write permission.
with open('data_links.txt', 'r') as f:
    # strip() also removes '\r' and stray spaces that strip('\n') left behind,
    # which would otherwise end up inside the URL.
    urls = [line.strip() for line in f]

def inUS(city):
    """Return True if ``city`` equals one of the six city names kept below, else False."""
    kept_cities = (
        'Phoenix',
        'Pittsburgh',
        'Charlotte',
        'Urbana-Champaign',
        'Las Vegas',
        'Madison',
    )
    return city in kept_cities
def replaceCities(city):
    """Map a recognised city name to itself; anything else becomes "nothing".

    NOTE(review): only four names are recognised here, while inUS() accepts
    six ('Las Vegas' and 'Madison' are missing) - confirm whether that is
    intentional before relying on this helper.
    """
    recognised = ('Phoenix', 'Pittsburgh', 'Charlotte', 'Urbana-Champaign')
    lookup = {name: name for name in recognised}
    return lookup.get(city, "nothing")
def isPrefferedCusine(cuisine):
    """Return True if ``cuisine`` contains 'Restaurants' and a preferred cuisine label.

    ``cuisine`` is whatever the ``categories`` column holds; the ``in`` tests
    below work on a list of labels (exact element match) or on a plain string
    (substring match).

    Values that do not support ``in`` at all (e.g. None or NaN for a record
    with no categories) are treated as "not preferred" instead of raising
    TypeError and aborting the whole ``apply``.

    NOTE(review): if categories are lists, labels such as 'American (New)'
    will not match 'American' because list membership is exact - confirm
    against the data whether that is intended.
    """
    preferred = ('American', 'Indian', 'Chinese', 'Japanese', 'Middle Eastern', 'Mexican')
    try:
        if 'Restaurants' not in cuisine:
            return False
        return any(label in cuisine for label in preferred)
    except TypeError:
        # cuisine is not a container/string (missing value): nothing to match.
        return False
def replaceCategory(cuisine):
    """Collapse ``cuisine`` to a single preferred cuisine label.

    The labels are tried in the fixed order below and the first one found in
    ``cuisine`` is returned. If none is found, ``cuisine`` is returned
    unchanged.
    """
    for label in ('American', 'Indian', 'Chinese', 'Japanese', 'Middle Eastern', 'Mexican'):
        if label in cuisine:
            return label
    return cuisine
In [4]:
# Load the business records: each line of the response is parsed as one JSON document.
url_bus = urls[0]
dataset = urlopen(url_bus)
try:
    data = [json.loads(line) for line in dataset]
finally:
    # Always release the HTTP connection, even if a line fails to parse.
    dataset.close()

df_bus = pd.DataFrame(data)
# Keep only rows whose city passes inUS() and whose categories pass
# isPrefferedCusine(). .copy() makes the result an independent frame so the
# column assignment below does not write into a slice of the unfiltered frame
# (which triggers pandas' SettingWithCopyWarning).
df_bus = df_bus[df_bus.city.apply(inUS) & df_bus.categories.apply(isPrefferedCusine)].copy()
# Collapse each row's categories to a single cuisine label.
df_bus['categories'] = df_bus.categories.apply(replaceCategory)
# df_bus['cuisine'] = df_bus['categories'].map(lambda x: re.match('^(\d+)', x).groups()[0])
In [5]:
# Load the review records: each line of the response is parsed as one JSON document.
url_rev = urls[1]
dataset = urlopen(url_rev)
try:
    data = [json.loads(line) for line in dataset]
finally:
    # Always release the HTTP connection, even if a line fails to parse.
    dataset.close()

df_rev = pd.DataFrame(data)
In [6]:
# Render bokeh output inline in the notebook.
output_notebook()

# Scatter plot: one point per business, star rating against review count.
p = figure(title="Ratings over Review Count")
p.xaxis.axis_label = "Number of Reviews"
p.yaxis.axis_label = "Rating"
p.scatter(
    df_bus['review_count'],
    df_bus['stars'],
    marker="circle",
    line_color="firebrick",
    fill_color="blue",
    fill_alpha=0.5,
    size=12,
)
show(p)
BokehJS successfully loaded.
In [7]:
# Load the check-in records: each line of the response is parsed as one JSON document.
url_check = urls[2]
dataset = urlopen(url_check)
try:
    data = [json.loads(line) for line in dataset]
finally:
    # Always release the HTTP connection, even if a line fails to parse.
    dataset.close()

df_check = pd.DataFrame(data)
In [8]:
# Load the user records: each line of the response is parsed as one JSON document.
url_user = urls[3]
dataset = urlopen(url_user)
try:
    data = [json.loads(line) for line in dataset]
finally:
    # Always release the HTTP connection, even if a line fails to parse.
    dataset.close()

df_user = pd.DataFrame(data)
In [9]:
# Attach business attributes to every review that belongs to a kept business.
# Both frames carry a 'stars' column, so pandas' default suffixes apply:
# 'stars_x' comes from df_rev and 'stars_y' from df_bus.
df_merge = df_rev.merge(df_bus, on='business_id', how='inner')
In [10]:
# Upper bound on how many reviews are plotted (both lists are sliced to this length).
count = 90000

# Per-review 'useful' vote count pulled out of the votes mapping, and the
# review's star rating, as plain lists in row order.
useful_votes = list(df_rev.votes.apply(lambda votes: dict(votes).get('useful')))
stars = list(df_rev['stars'])

output_notebook()
p = figure(title="Ratings over 'Useful' Vote Count")
p.xaxis.axis_label = "Number of 'Useful' Votes"
p.yaxis.axis_label = "Rating"
p.scatter(
    useful_votes[:count],
    stars[:count],
    marker="circle",
    line_color="firebrick",
    fill_color="blue",
    fill_alpha=0.5,
    size=12,
)
show(p)
BokehJS successfully loaded.
In [11]:
# 'year' is the leading run of digits of each date string, kept as a string
# (later cells compare it against '2012', '2013', '2014').
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence and produces a warning on modern Python.
df_merge['year'] = df_merge['date'].map(lambda x: re.match(r'^(\d+)', x).group(1))
In [12]:
output_notebook()

# Mean business star rating per cuisine label, one bar group per city.
bar1 = Bar(
    df_bus,
    label='categories',
    values='stars',
    group='city',
    agg='mean',
    title="Avg Star Rating for each Cuisine",
    legend='top_left',
)
show(bar1)
BokehJS successfully loaded.
/usr/local/lib/python2.7/site-packages/bokeh/charts/_attributes.py:78: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df = df.sort(columns=columns)
In [13]:
output_notebook()

# Mean of 'stars_y' per cuisine, grouped by review year, limited to 2012-2014.
# NOTE(review): 'stars_y' is the star column that came from df_bus in the merge,
# not the individual review's rating ('stars_x') - confirm this is the intended
# measure for a per-year comparison.
bar = Bar(
    df_merge[df_merge['year'].isin(['2012', '2013', '2014'])],
    label='categories',
    values='stars_y',
    group='year',
    agg='mean',
    title="Avg Star Rating for each Cuisine over years",
    legend='top_left',
    bar_width=0.4,
)
show(bar)
BokehJS successfully loaded.
In [14]:
output_notebook()

# Number of merged review rows per cuisine, grouped by review year, limited to 2012-2014.
bar = Bar(
    df_merge[df_merge['year'].isin(['2012', '2013', '2014'])],
    label='categories',
    values='review_count',
    group='year',
    agg='count',
    title="Total Review Count for each Cuisine over years",
    legend='top_left',
    bar_width=0.4,
)
show(bar)
BokehJS successfully loaded.
In [95]:
output_notebook()

# Number of merged review rows at each 'stars_y' value, one bar group per cuisine.
bar = Bar(
    df_merge,
    label='stars_y',
    values='review_count',
    group='categories',
    agg='count',
    title="Ratings of Cuisines",
    xlabel="Ratings",
    ylabel="No. of Reviews",
    legend='top_left',
    bar_width=0.4,
)
show(bar)
BokehJS successfully loaded.
In [97]:
output_notebook()

# Count businesses per cuisine, one bar group per city.
# The count is taken over df_bus (one row per business). Counting business_id
# in df_merge would count one row per *review*, so a restaurant with many
# reviews would be counted many times and the chart would not show the number
# of restaurants.
bar = Bar(
    df_bus,
    label='categories',
    values='business_id',
    agg='count',
    group='city',
    legend='top_left',
    title="Total Number of Restaurants for each Cuisine in the Country",
    ylabel="Total Business",
    xlabel="Cuisines",
    bar_width=0.4,
)
show(bar)
BokehJS successfully loaded.
In [ ]:
# Attach business attributes to the check-in records of the kept businesses.
df_merge2 = pd.merge(df_check, df_bus, on='business_id', how='inner')

# Dump the check-in mapping of the first few rows.
# Iterating a DataFrame yields its column labels (strings), not rows, so the
# 'checkin_info' column is walked explicitly.
for checkin_info in df_merge2['checkin_info'].head():
    # .items() and %-formatting behave the same on Python 2 and 3, matching
    # the version-agnostic urlopen import at the top of the notebook.
    for k, v in checkin_info.items():
        print("%s %s" % (k, v))
In [80]:
import matplotlib.image as mpimg
from IPython.display import Image

# Build one corpus from every review text whose cuisine label is 'Japanese'.
# Numbered-list markers such as "1. " are stripped from each review first.
# Reviews are joined with a space: concatenating them back-to-back fuses the
# last word of one review with the first word of the next into a bogus token,
# and repeated += on a string is quadratic in the corpus size.
text = ' '.join(
    re.sub(r"\d+\. ", "", review)
    for review in df_merge.text[df_merge['categories'] == 'Japanese']
)

stopwords = STOPWORDS.copy()
wc = WordCloud(max_font_size=40, stopwords=stopwords, margin=10,
               random_state=1).generate(text)

# Snapshot the image before recolor() is called, for the second figure.
default_colors = wc.to_array()

# First figure: the recoloured cloud, which is also written to disk.
plt.title("Custom colors")
plt.imshow(wc.recolor())
wc.to_file("Japanese.png")
plt.axis("off")

# Second figure: the snapshot taken before recolouring.
plt.figure()
plt.title("Default colors")
plt.imshow(default_colors)
plt.axis("off")
plt.show()

# Last expression of the cell: display the saved PNG as the cell output.
Image(filename='Japanese.png')
Out[80]: